set more off
set scheme s1color
set graph off

*** Required packages
/*
ssc install semean
ssc install egenmore 
ssc install geodist
ssc install winsor2
ssc install reghdfe
ssc install ftools
ssc install erepost
ssc install estout
ssc install rdrobust
ssc install spmap
ssc install shp2dta
net install lpdensity, from("https://raw.githubusercontent.com/nppackages/lpdensity/master/stata") replace
net install rddensity, from("https://raw.githubusercontent.com/rdpackages/rddensity/master/stata") replace

* The command DCdensity for the McCrary density test requires manual installation.
* Please see here: https://eml.berkeley.edu/~jmccrary/DCdensity/
*/

* Define the path where all data from the Harvard Dateverse dataset is stored and create an additional folder "results". The tables and figures will be stored in that folder
cd "C:\Users\esj\Desktop\08_Toxic Waste Management_tobeudpated\Replication"

********************************************************************************
*** Load & prepare facility-chemical-level data
use tri_nlrb_data.dta, clear

**** Specify sample
keep if casetype=="RC" // Keep only Representation Certification cases (workers vote to get unionized)
keep if nets==3 // Keep only non-missing obsevations for essential variables (NETS variables are deleted here, because data is proprietary)
keep if electionyear<=2017 & electionyear>=1990 // Start in 1991 after the Clean Air Act amendment in 1990. End in 2017 because TRI Sample until 2020. We measure effects up to three years post election.


* Create numerical variables to include as fixed effects
encode casrn, generate(ncasrn)
encode industry_highlev, gen(nindustry_highlev)
encode facility_state, generate(nfacility_state)


**** Define Main Dependent Variables
global allratios ratwaste_sum ratrelease_sum rattotal_onsite_releases rattotal_offsite_releases ratmanaged_sum rattotal_onsite_waste_man ratoffsite_waste_man ratonsite_air_rel ratonsite_water_rel ratonsite_land_rel

global ratios ratrelease_sum rattotal_onsite_releases rattotal_offsite_releases ratmanaged_sum rattotal_onsite_waste_man ratoffsite_waste_man

foreach v of varlist $allratios {
	replace `v'=. if `v'>3
	replace l`v'=. if `v'>3
}

*** Classify Union model (service or organizing model)
* service: International Brotherhood of Electrical Workers (IBEW), International Chemical Workers Union Council (ICWUC), Oil, Chemical and Atomic Workers International Union (OCAW),  International Union of Operating Engineers (IUOE)
* between: United Steelworkers (USW), Teamsters Union (International Brotherhood of Teamsters, IBT), International Association of Machinists and Aerospace Workers (IAM)
* organizing: United Auto Workers (UAW), Service Employees International Union (SEIU), Communications Workers of America (CWA), United Food and Commercial Workers (UFCW)

*clean uniontocertify
replace uniontocertify = strlower(uniontocertify)
duplicates tag uniontocertify, gen(tocheck)
sort tocheck

gen cleanunion = ""

replace cleanunion = "IBT" if strpos(uniontocertify,"ibt")>0
replace cleanunion = "IBT" if strpos(uniontocertify,"teamsters")>0

replace cleanunion = "USW" if strpos(uniontocertify,"usw")>0
replace cleanunion = "USW" if uniontocertify=="steel wkrs"
replace cleanunion = "USW" if strpos(uniontocertify,"united steel")>0

replace cleanunion = "UAW" if  strpos(uniontocertify,"uaw")>0
replace cleanunion = "UAW" if uniontocertify=="auto wkrs"
replace cleanunion = "UAW" if strpos(uniontocertify,"united automobile")>0

replace cleanunion = "ICWUC" if strpos(uniontocertify,"icwuc")>0
replace cleanunion = "ICWUC" if strpos(uniontocertify,"international chemical workers union council")>0
replace cleanunion = "ICWUC" if strpos(uniontocertify,"chemical wkrs")>0
replace cleanunion = "ICWUC" if strpos(uniontocertify,"chemical workers")>0

replace cleanunion = "UFCW" if strpos(uniontocertify,"ufcw")>0
replace cleanunion = "UFCW" if strpos(uniontocertify,"united food and commer wkrs")>0
replace cleanunion = "UFCW" if strpos(uniontocertify,"united food & commer wkrs")>0

replace cleanunion = "IBEW" if strpos(uniontocertify,"electrical workers")>0
replace cleanunion = "IBEW" if strpos(uniontocertify,"electrical wkrs")>0
replace cleanunion = "IBEW" if strpos(uniontocertify,"ibew")>0

replace cleanunion = "OCAW" if strpos(uniontocertify,"oil chem. & atomic wkrs")>0

replace cleanunion = "IAM" if strpos(uniontocertify,"iam")>0
replace cleanunion = "IAM" if strpos(uniontocertify,"international association of machinists and aerospace workers")>0
replace cleanunion = "IAM" if strpos(uniontocertify,"international association of machinists & aerospace workers")>0

replace cleanunion = "IUOE" if strpos(uniontocertify,"international union of operating engineers")>0
replace cleanunion = "IUOE" if strpos(uniontocertify,"iuoe")>0

gen unionmodel = ""
replace unionmodel = "service" if cleanunion == "IBEW" |cleanunion=="ICWUC" | cleanunion=="OCAW" | cleanunion=="IUOE" | cleanunion=="USW" | cleanunion=="IBT"
*replace unionmodel = "between" if cleanunion == "USW" |cleanunion=="IBT" | cleanunion=="IAM"
replace unionmodel = "organizing" if cleanunion == "UAW" | cleanunion == "SEIU" | cleanunion=="CWA" |cleanunion=="UFCW" | cleanunion=="IAM" 


*** Human Toxicity Potential
preserve
duplicates drop casrn cancer_air-noncancer_water, force
duplicates drop casrn, force // 1 duplicate, why?
keep casrn cancer_air-noncancer_water

gen cancer_htp=0
replace cancer_htp=1 if cancer_air!=. | cancer_water!=.

replace noncancer_air=999999 if missing(noncancer_air) & !missing(cancer_air)
replace noncancer_water=999999 if missing(noncancer_water) & !missing(cancer_water)
replace noncancer_air=0 if missing(noncancer_air)
replace noncancer_air=0 if missing(noncancer_air)

* Create dummy for dangerous half, i.e. largest htp noncancer_air value
gen air_htp=0
gsort -noncancer_air
gen x1=_n
egen y1=max(x1)
*replace air_htp=1 if x1<=150
replace air_htp=1 if x1/y1<0.5

* Create dummy for dangerous half, i.e. largest htp noncancer_water value
gen water_htp=0
gsort -noncancer_water
gen x2=_n
egen y2=max(x2)
*replace water_htp=1 if x2<=150
replace water_htp=1 if x2/y2<=0.5
drop x1 x2 y1 y2 cancer_air-noncancer_water

tempfile tox
save `tox'

restore
merge n:1 casrn using `tox', nogenerate

**** SAVE DATASET for waste management analyses
save tri_nlrb_sample.dta, replace


keep trifd latitude longitude nvshare voters city_samp case_number facility_state facility_city electionstate employerstate numberunions uniontocertify vshare vmargin year electionyear rtw industry_highlev industry T
duplicates drop case_number trifd year, force

save tri_nlrb_sample_short.dta, replace

*** ADD violation data
* Three files from ICIS FE&C Dataset (ZIP), downloaded from https://echo.epa.gov/tools/data-downloads#downloads
*Please note that the data quality prior to November of 2000 has not been assessed and should be considered unknown. You can find additional information about the data presented in ECHO here: https://echo.epa.gov/resources/echo-data/about-the-data. 

*** Load 
import delimited "violationdata/CASE_PROGRAMS.csv", clear 
tempfile programs
save `programs'

import delimited "violationdata/CASE_VIOLATIONS.csv", clear //activity_id not unique for both files
merge n:m activity_id using `programs'
drop _merge

tempfile provio
save `provio'

import delimited "violationdata/CASE_FACILITIES.csv", clear
destring activity_id, replace force
merge n:m activity_id using `provio'

keep if _merge==3
drop _merge
rename registry_id frs_facility_id
* TRI violations
*br if program_code=="EPCRATRI"
*case_number (gives date): values typically begin with either two characters (EPA region number, "HQ" (headquarters), "EF" (CAA Eastern Field Office), or "WF" (CAA Western Field Office), followed by the four digit fiscal year. In a majority of cases, this is the year in which the action was initiated. The fiscal year is followed by a four character sequence identifier. State CASE_NUMBER values do not follow any set format.

keep if program_code=="EPCRATRI" //check with EPA if this is correct
gen violationyear1=substr(case_number,4,4)
destring violationyear1, replace

sort frs_facility_id violationyear1

gen violationyear2=""
replace violationyear2 = substr(case_number,4,4) if frs_facility_id[_n-1]==frs_facility_id
destring violationyear2, replace

gen violationyear3=""
replace violationyear3 = substr(case_number,4,4) if frs_facility_id[_n-2]==frs_facility_id
destring violationyear3, replace

gen violationyear4=""
replace violationyear4 = substr(case_number,4,4) if frs_facility_id[_n-3]==frs_facility_id
destring violationyear4, replace

rename case_number vio_case_number


collapse (firstnm) violationyear1 violationyear2 violationyear3 violationyear4, by(frs_facility_id) 
*br if violationyear2!=violationyear1 & violationyear2!=.

replace violationyear4 =. if violationyear3==violationyear4
replace violationyear3 =. if violationyear3==violationyear2
replace violationyear2 =. if violationyear1==violationyear2
gen nrviolations = 1
replace nrviolations = 2 if violationyear2!=. | violationyear3!=. | violationyear4!=.
replace nrviolations = 3 if (violationyear3!=. & violationyear2!=.) | (violationyear3!=. & violationyear4!=.) | (violationyear2!=. & violationyear4!=.) 
replace nrviolations = 4 if violationyear4!=. & violationyear3!=. & violationyear2!=.

* Manual corrections (maximal 3 violations)
replace violationyear2=violationyear4 if violationyear4!=. //works because those don't have violationyear2
drop violationyear4

replace violationyear2=violationyear3 if nrviolations!=3 //works because those don't have violationyear2
replace violationyear3=. if nrviolations!=3
 

merge 1:n frs_facility_id using tri_nlrb_sample, gen(_merge_vio)
drop if _merge_vio==1 //those that we don't have in our union-matched TRI dataset 

save tri_nlrb_echo_sample, replace 
 
 

** already keep only elections that are relevant later. citysamp and voters, later delete duplicates trifd year, can have different elections in different years and cities -> don't know how to get to original numbers...


*** ADD Pollution Data for pollution analyses
* Make Panel from annual air pollution summaries to be downloaded from https://aqs.epa.gov/aqsweb/airdata/download_files.html

forvalues x = 1991/2020 {
unzipfile "airpollution/annual_conc_by_monitor_`x'"
import delimited "annual_conc_by_monitor_`x'", clear
tostring *, replace force
tempfile annual_conc_by_monitor_`x'
save `annual_conc_by_monitor_`x''
} 

unzipfile "airpollution/annual_conc_by_monitor_1990"
import delimited "annual_conc_by_monitor_1990", clear
tostring *, replace force
forvalues x = 1991/2020 {
append using `annual_conc_by_monitor_`x''
}

rename (statecode countycode sitenum latitude longitude) (m_statecode m_countycode m_sitenum m_latitude m_longitude) 
destring year, force replace
destring m_statecode m_countycode m_sitenum m_latitude m_longitude, replace force

save "airpollution/annual_conc_by_monitor_panel.dta", replace

keep m_statecode m_countycode m_sitenum m_latitude m_longitude year
gsort m_latitude m_longitude -year
*sitenum changes over years, unique with lat and long
duplicates drop m_latitude m_longitude, force

gen m_id = _n
save unique_pollution_monitors.dta, replace


*** Match to TRI-NLRB Data at the Facility Level
use tri_nlrb_sample_short.dta, clear
duplicates drop trifd, force
keep trifd latitude longitude
gen id = _n

cross using unique_pollution_monitors.dta

geodist latitude longitude m_latitude m_longitude, gen(d)
sort trifd d
by trifd, sort: egen mind=min(d)
save temp.dta, replace
format d % 9.0g
recast float d, force
keep if d==mind
duplicates tag trifd, gen(dup)
drop year mind
sum d, d
save trifd_m_id_geodistmatch.dta, replace

* don't need this here merge 1:m trifd using tri_nlrb_sample_short.dta, keep(3) gen(merge)

joinby m_latitude m_longitude using "airpollution/annual_conc_by_monitor_panel.dta" //without year



destring observationcount observationpercent completenessindicator validdaycount requireddaycount exceptionaldatacount nulldatacount primaryexceedancecount secondaryexceedancecount certificationindicator numobsbelowmdl arithmeticmean arithmeticstandarddev stmaxvalue ndmaxvalue stmaxdatetime ndmaxvalue ndmaxdatetime rdmaxvalue rdmaxdatetime thmaxvalue thmaxdatetime stmaxnonoverlappingvalue stnomaxdatetime ndmaxnonoverlappingvalue ndnomaxdatetime thpercentile v43 v44 v45 v46 v47 v48, replace force

*** Make comparable
* first check PMs: 
foreach v of varlist arithmeticmean arithmeticstandarddev stmaxvalue ndmaxvalue rdmaxvalue thmaxvalue stmaxnonoverlappingvalue ndmaxnonoverlappingvalue thpercentile v43 v44 v45 v46 v47 v48 {
	replace `v' = `v'/1000 if strpos(unitsofmeasure,"Nano")>0
	replace `v' = `v'*1000 if strpos(unitsofmeasure,"Parts per million")>0 & parametername!="Ozone" & parametername!="Carbon monoxide"
}
gen units = unitsofmeasure
replace units = "Micrograms/cubic meter" if strpos(unitsofmeasure,"Nano")>0
replace units = "Parts per billion" if strpos(unitsofmeasure,"Parts per million")>0 & parametername!="Ozone" & parametername!="Carbon monoxide"

drop if units=="Inverse Megameters"
winsor2 stmaxvalue arithmeticmean v47 primaryexceedancecount, replace

save trifd_pollution_data.dta, replace 


*** MAKE AIR QUALITY INDEX
*use all or AQI or only PM
* CALCULATE AQI (Air Quality Index)
* Ozone in ppm; checked! tab unitsofmeasure if parametername =="Ozone"
* Carbon monoxide in ppm; correct
* Sulfur dioxide in ppb; correct
* Nitrogen dioxide (NO2) in ppb; correct
* PM in micrograms/cubic meter; checked! tab units if strpos(parametername,"PM")>0
*reshape from long to wide (different chemicals in different columns)
keep if parametername =="Ozone" | parametername =="Carbon monoxide" |  parametername =="Sulfur dioxide" |  parametername =="Nitrogen dioxide (NO2)" | strpos(parametername,"PM")>0

gen parameter = ""
replace parameter = "Ozone"  if parametername =="Ozone"
replace parameter ="CO" if parametername =="Carbon monoxide"
replace parameter ="SO2" if parametername =="Sulfur dioxide"
replace parameter ="NO2" if  parametername =="Nitrogen dioxide (NO2)"
replace parameter = "PM25" if strpos(parametername,"PM2.5")>0
replace parameter = "PM10" if strpos(parametername,"PM10")>0

*mean over pm
*check metricused
by trifd year parameter, sort: egen value=mean(arithmeticmean) 
duplicates drop trifd year parameter, force
keep trifd year parameter value d //don't need units, know the units for each parameter
egen triyear = group(trifd year)
reshape wide value, i(triyear) j(parameter) string
 
*aqi is an index per pollutant. Index over all pollutants, is the worst pollutant-specific AQI
foreach v of varlist valueCO valueNO2 valueOzone valuePM10 valuePM25 valueSO2  {
gen aqi`v' =.
}
*make index upper boundary
gen aqiigood = 50
gen aqiimoderate = 100
gen aqiunhealthysen = 150
gen aqiiunhealthy = 200
gen aqiiveryunhealthy = 300
gen aqiihazardous = 500

* per pollutant: Ozone
replace aqivalueOzone = ((500-401)/(0.604-0.505))*(valueOzone-0.505)+401 if valueOzone > 0.504
replace aqivalueOzone = ((400-301)/(0.504-0.405))*(valueOzone-0.405)+301 if valueOzone <= 0.504
replace aqivalueOzone = ((300-201)/(0.2-0.106))*(valueOzone-0.106)+201 if valueOzone <= 0.2
replace aqivalueOzone = ((200-151)/(0.105-0.086))*(valueOzone-0.086)+151 if valueOzone <= 0.105
replace aqivalueOzone = ((150-101)/(0.085-0.071))*(valueOzone-0.071)+101 if valueOzone <= 0.085
replace aqivalueOzone = ((100-51)/(0.07-0.055))*(valueOzone-0.055)+51 if valueOzone <= 0.07
replace aqivalueOzone = ((50-0)/(0.054-0))*(valueOzone-0)+0 if valueOzone <= 0.054

* PM2.5
replace aqivaluePM25 = ((500-401)/(500.4-350.5))*(valuePM25-350.5)+401 if valuePM25 > 350.4
replace aqivaluePM25 = ((400-301)/(350.4-250.5))*(valuePM25-250.5)+301 if valuePM25 <= 350.4
replace aqivaluePM25 = ((300-201)/(250.4-150.5))*(valuePM25-150.5)+201 if valuePM25 <= 250.4
replace aqivaluePM25 = ((200-151)/(150.4-55.5))*(valuePM25-55.5)+151 if valuePM25 <= 150.4
replace aqivaluePM25 = ((150-101)/(55.4-35.5))*(valuePM25-35.5)+101 if valuePM25 <= 55.4
replace aqivaluePM25 = ((100-51)/(35.4-12.1))*(valuePM25-12.1)+51 if valuePM25 <= 35.4
replace aqivaluePM25 = ((50-0)/(12-0))*(valuePM25-0)+0 if valuePM25 <= 0.054

* PM10 
replace aqivaluePM10 = ((500-401)/(604-505))*(valuePM10-505)+401 if valuePM10 > 504
replace aqivaluePM10 = ((400-301)/(504-425))*(valuePM10-425)+301 if valuePM10 <= 504
replace aqivaluePM10 = ((300-201)/(424-355))*(valuePM10-355)+201 if valuePM10 <= 424
replace aqivaluePM10 = ((200-151)/(354-255))*(valuePM10-255)+151 if valuePM10 <= 354
replace aqivaluePM10 = ((150-101)/(254-155))*(valuePM10-155)+101 if valuePM10 <= 254
replace aqivaluePM10 = ((100-51)/(154-55))*(valuePM10-55)+51 if valuePM10 <= 154
replace aqivaluePM10 = ((50-0)/(54-0))*(valuePM10-0)+0 if valuePM10 <= 54

* CO 
replace aqivalueCO = ((500-401)/(50.4-40.5))*(valueCO-40.5)+401 if valueCO > 40.4
replace aqivalueCO = ((400-301)/(40.4-30.5))*(valueCO-30.5)+301 if valueCO <= 40.4
replace aqivalueCO = ((300-201)/(30.4-15.5))*(valueCO-15.5)+201 if valueCO <= 30.4
replace aqivalueCO = ((200-151)/(15.4-12.5))*(valueCO-12.5)+151 if valueCO <= 15.4
replace aqivalueCO = ((150-101)/(12.4-9.5))*(valueCO-9.5)+101 if valueCO <= 12.4
replace aqivalueCO = ((100-51)/(9.4-4.5))*(valueCO-4.5)+51 if valueCO <= 9.4
replace aqivalueCO = ((50-0)/(4.4-0))*(valueCO-0)+0 if valueCO <= 4.4

* SO2 
replace aqivalueSO2 = ((500-401)/(1004-805))*(valueSO2-805)+401 if valueSO2 > 804
replace aqivalueSO2 = ((400-301)/(804-605))*(valueSO2-605)+301 if valueSO2 <= 804
replace aqivalueSO2 = ((300-201)/(604-305))*(valueSO2-305)+201 if valueSO2 <=604
replace aqivalueSO2 = ((200-151)/(304-186))*(valueSO2-186)+151 if valueSO2 <= 304
replace aqivalueSO2 = ((150-101)/(185-76))*(valueSO2-76)+101 if valueSO2 <= 185
replace aqivalueSO2 = ((100-51)/(75-36))*(valueSO2-36)+51 if valueSO2 <= 75
replace aqivalueSO2 = ((50-0)/(35-0))*(valueSO2-0)+0 if valueSO2 <= 35

* NO2
replace aqivalueNO2 = ((500-401)/(2049-1650))*(valueNO2-1650)+401 if valueNO2 > 1649
replace aqivalueNO2 = ((400-301)/(1649-1250))*(valueNO2-1250)+301 if valueNO2 <= 1649
replace aqivalueNO2 = ((300-201)/(1249-650))*(valueNO2-650)+201 if valueNO2 <= 1249
replace aqivalueNO2 = ((200-151)/(649-361))*(valueNO2-361)+151 if valueNO2 <= 649
replace aqivalueNO2 = ((150-101)/(360-101))*(valueNO2-101)+101 if valueNO2 <= 360
replace aqivalueNO2 = ((100-51)/(100-54))*(valueNO2-54)+51 if valueNO2 <= 100
replace aqivalueNO2 = ((50-0)/(53-0))*(valueNO2-0)+0 if valueNO2 <= 53

egen aqimean = rowmean(aqivalueCO aqivalueNO2 aqivalueOzone aqivaluePM10 aqivaluePM25 aqivalueSO2)
egen aqione = rowmax(aqivalueCO aqivalueNO2 aqivalueOzone aqivaluePM10 aqivaluePM25 aqivalueSO2)
sum aqione aqimean, d

duplicates tag trifd year, gen(duptest) 
*br trifd year aqimean aqione duptest
duplicates drop trifd year, force

save trifd_pollution_aqi_temp.dta, replace 
use trifd_pollution_aqi_temp, clear

merge 1:n trifd year using tri_nlrb_sample_short, keep(3)

*** Winsorize
winsor2 aqimean, replace
winsor2 aqione, replace


***** before dropping trifd year duplicates, keep the observations that we need (in case more than one election per facility)

gen d_miles = d*0.621371
keep if d_miles<5  //max 5 miles pollution monitor from tri facility

egen uniqueid = group(trifd case_number) 
xtset uniqueid year

foreach var of varlist aqione aqimean {
    gen rat`var' = `var'/l.`var'
	replace rat`var'=. if rat`var'>3 | rat`var'<0
}

label variable aqione "Air Quality Index (AQI)"
label variable rataqione "AQI ratio"

save tri_nlrb_pollution_aqi_sample, replace


* MERGE WASTE MANAGEMENT AND AIR POLLUTION DATA 
use tri_nlrb_sample.dta, clear 
merge n:1 trifd case_number year using tri_nlrb_pollution_aqi_sample, gen(mergedata)
save tri_nlrb_pollution_sample, replace


****************** DONE
